"""
Build the novel corpus.

Reads the full novel text from ``../novel.txt`` (GBK-encoded) and splits it
into chapters. The result is the module-level dict ``chs`` mapping the
1-based chapter number to that chapter's text, where the text is every
non-empty line of the chapter (heading included), stripped and prefixed with
a single space, concatenated in file order.
"""

# Read the whole novel and group its lines by chapter number.
with open('../novel.txt', encoding='gbk') as f:
    i = 0  # chapter counter; 0 collects anything before the first heading
    _chapter_lines = {}  # chapter number -> list of stripped, non-empty lines
    for line in f:
        line = line.strip()

        # Skip blank lines.
        if not line:
            continue

        # A chapter heading (e.g. "第一回", "第二十一回") is a short line that
        # starts with '第' and has '回' within its first five characters.
        if line.startswith('第') and '回' in line[:5] and len(line) < 30:
            i += 1
        elif "附录一：成吉思汗家族" in line[:20]:
            # The first appendix begins here; nothing after it is chapter text.
            break

        _chapter_lines.setdefault(i, []).append(line)

# Join each chapter once instead of growing a string line by line. The
# leading space reproduces the ' ' + line layout of the accumulated text.
chs = {num: ' ' + ' '.join(lines) for num, lines in _chapter_lines.items()}

# Drop the front matter that precedes the first chapter. pop() with a default
# avoids a KeyError when the file starts directly with a chapter heading.
chs.pop(0, None)
# NOTE: the original author recorded len(chs) == 40 for this novel.
print(chs[1])